{
  "model": [
    {
      "name": "model1"
    },
    {
      "name": "model2"
    }
  ],
  "dataset": [
    {
      "name": "mmlu",
      "metrics": [
        "first_token_accuracy",
        "single_choice_accuracy",
        "perplexity",
        "ppl_score",
        "ppl_score_over_choices"
      ]
    },
    {
      "name": "cmmlu",
      "metrics": [
        "first_token_accuracy",
        "single_choice_accuracy",
        "perplexity",
        "ppl_score",
        "ppl_score_over_choices"
      ]
    },
    {
      "name": "agieval",
      "metrics": [
        "first_token_accuracy",
        "single_choice_accuracy",
        "multi_choice_accuracy",
        "math_equivalence",
        "perplexity",
        "ppl_score_over_choices",
        "ppl_score"
      ]
    },
    {
      "name": "gaokaobench",
      "metrics": [
        "first_token_accuracy",
        "single_choice_accuracy",
        "multi_choice_accuracy",
        "math_equivalence",
        "rouge_score",
        "rouge_zh_score",
        "perplexity",
        "ppl_score_over_choices",
        "ppl_score"
      ]
    }
  ]
}
